Data analysis 1

Modeling

library("tidyverse")
library("broom")
library("ggrepel")
augmented_table <- read_tsv("../data/03_augmented_table.tsv")
Rows: 2570560 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (10): GeneFeature, Sample_names, Patient, IDH_status, Localisation, Prim...
dbl  (7): Quantile, CPM, GEP_score, Age, Batch, TLS_status_bin, CPM_log2

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
augmented_metadata <- read_tsv("../data/03_augmented_metadata.tsv")
Rows: 64 Columns: 13
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (9): Sample_names, Patient, IDH_status, Localisation, Primary_or_Recurre...
dbl (4): GEP_score, Age, Batch, TLS_status_bin

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#MODEL 1: CPM log2 transformed ~ TLS_status
nested_data <- augmented_table |>
  group_by(GeneFeature) |>
  nest()  |>
  mutate(
    model = map(data, ~lm(CPM_log2 ~ TLS_status_bin, data = .x)),
    tidy_model = map(model, ~tidy(.x, conf.int = TRUE))
  )
gene_results <- nested_data |>
  unnest(tidy_model) |>
  filter(term == "TLS_status_bin") |>
  select(GeneFeature, estimate, p.value, conf.low, conf.high) |>
  mutate(
    q.value = p.adjust(p.value, method = "fdr"),
    significant = q.value < 0.01
  )
#MODEL 2: GEP_score ~ TLS_status
gep_model <- lm(GEP_score ~ TLS_status_bin, data = augmented_metadata)
gep_model_tidy <- tidy(gep_model, conf.int = TRUE)
#Forest plot for significant genes (CPM ~ TLS_status)
forest_plot <- gene_results |>
  filter(significant == TRUE) |>
  mutate(GeneFeature = fct_reorder(GeneFeature, estimate)) |>
  ggplot(aes(x = estimate, y = GeneFeature)) +
  geom_point() +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = 0.2) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(
    title = "Differential Expression by TLS Status (CPM)",
    subtitle = "Forest plot of statistically significant genes (FDR < 0.01)",
    x = "Effect size (Estimate ± 95% CI)",
    y = NULL
  ) +
  theme_minimal()

#Volcano plot for all genes
volcano_plot <- gene_results |>
  mutate(
    neg_log10_p = -log10(p.value),
    label = if_else(significant, GeneFeature, "")
  ) |>
  ggplot(aes(x = estimate, y = neg_log10_p, color = significant)) +
  geom_point(alpha = 0.6) +
  geom_text_repel(aes(label = label), size = 2, max.overlaps = 20) +
  geom_hline(yintercept = 0, linetype = "dotted") +
  labs(
    title = "Volcano Plot of Gene Effects (CPM ~ TLS_status)",
    x = "Effect size (Estimate)",
    y = "-log10(p-value)",
    color = "Significant"
  ) +
  theme_minimal()
#Print forest plot
forest_plot

#Print volcano plot
volcano_plot
Warning: Removed 6543 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 6543 rows containing missing values or values outside the scale range
(`geom_text_repel()`).
Warning: ggrepel: 1065 unlabeled data points (too many overlaps). Consider
increasing max.overlaps